{'(task)': {'task': 'default',
            'batch_by_len*': True,
            '(loss)': {'loss*': 'chat', 'predict_threshold': None},
            '(model)': {'model*': 'fidt5chat',
                        'pretrained_model*': 'TBD',
                        'from_tf': False,
                        '(ChatBase)': {'gradient_checkpointing*': True},
                        '(huggingface)': {'auto_model*': 'google/mt5-xl'},
                        '(pretrained_model)': {'strict_size': True}},
            '(dataset)': {'dataset': 'default', 'dataset_format': 'torch', '(sampler)': {'sampler': 'default'}}},
 '(trainer)': {'data_dir*': 'chat_pipeline/save_dir/data/',
               'save_dir*': 'TBD',
               'overwrite': False,
               'resume': False,
               'max_epoch*': 90,
               'max_update': None,
               'batch_size*': 960,
               'update_freq': 1,
               'auto_ga': False,
               'max_loss': 200.0,
               'min_lr': 0.0,
               'seed': 1,
               'cache_train_file': True,
               'lazy_load*': True,
               'data_size*': {'dev': 2000,
                              'train_0': 200000,
                              'train_1': 200000,
                              'train_2': 200000,
                              'train_3': 200000,
                              'train_4': 200000,
                              'train_5': 75948},
               'ema_decay': 1.0,
               'train_subset*': 'train*',
               'exclude_valid_from_train': False,
               'train_steps': None,
               'auto_suffix': False,
               '(log)': {'log_interval': 10, 'log_file': 'log.txt', 'viz_ref_dir': None, 'figext': 'png'},
               '(fp16)': {'fp16': False},
               '(bf16)': {'bf16': False},
               '(evaluation)': {'eval_interval*': 1000,
                                'save': True,
                                'tolerance': None,
                                'min_steps': 1,
                                'major_metric*': 'loss',
                                'ascending_metric*': False,
                                'eval_interval_warmup*': 50,
                                'valid_subset': 'dev',
                                'max_eval_steps': None,
                                'inspect_gradient': False,
                                '(save)': {'save_best_only': False,
                                           'save_above_score': None,
                                           'save_full_checkpoint': False,
                                           'save_last_only': False},
                                '(eval_interval_warmup)': {'eval_interval_warmup_mutiplier': 20.0}},
               '(distributed)': {'cuda': True,
                                 'distributed_world_size*': 64,
                                 'distributed_rank': 0,
                                 'device_id*': 0,
                                 '(cuda)': {'distributed_init_method': 'env://',
                                            'empty_cache_freq': 0,
                                            '(distributed_init_method)': {'distributed_backend': 'nccl',
                                                                          'bucket_cap_mb': 25,
                                                                          'ddp_backend': 'no_c10d',
                                                                          'use_bmuf': False}}}},
 '(processor)': {'processor*': 't5_fidchat_instruction',
                 'max_len*': 380,
                 'min_len': 1,
                 'pad_index*': 0,
                 'pad_word*': '<pad>',
                 'unk_word*': '<unk>',
                 'vocab_size*': 250100,
                 'num_classes*': 1,
                 'target_unk': None,
                 'max_encoder_length*': 380,
                 'max_decoder_length*': 512,
                 'max_n_passage': 20},
 '(deepspeed)': {'deepspeed_save_dir*': '/mnt/workspace/workgroup/hehong.chh/experiment/mt5_xl/sft/v2.6.0_epoch15_lr1e-4_bs960/ds_states',
                 'deepspeed_zero_stage*': 2,
                 'deepspeed_bf16*': True,
                 'deepspeed_fp16': False},
 '(optimization)': {'learning_rate*': 0.0001,
                    'clip_norm': 5,
                    '(optimizer)': {'optimizer': 'adam',
                                    '(adam_optimizer)': {'adam_betas': [0.9, 0.999],
                                                         'adam_eps': 1e-06,
                                                         'weight_decay*': 0.01}},
                    '(lr_scheduler)': {'lr_scheduler*': 'one_cycle',
                                       'warmup_steps*': 1800,
                                       'anneal_strategy': 'linear',
                                       'div_factor': 600.0,
                                       'cycle_momentum': False}}}